#!/bin/bash

#reconstruct Table1 from the SI information

# Make sure the output directory exists before anything is written into it.
if [ ! -d Table1 ]; then
	mkdir Table1
fi

# From here on, unquoted expansions are split on single spaces only.
IFS=' '

# Remove column files left behind by a previous run: the page loop appends
# to them with >>, so stale content would otherwise end up duplicated.
for ((i = 1; i <= 6; i++)); do
	stale="Table1/col$i.txt"
	if [ -f "$stale" ]; then
		rm -f "$stale"
	fi
done

# Rebuild the raw table columns page by page (pages 1..9).
#
# For every page, the line count N of page$p/par2.txt decides how many rows
# the page contributes:
#   - the last N lines of par1.txt are appended to Table1/col1.txt with every
#     space replaced by an underscore;
#   - the first N lines of four further par files are appended to
#     Table1/col2.txt .. Table1/col5.txt, in the order they are listed.
for ((p = 1; p <= 9; p++)); do
	# Arithmetic expansion normalises the count: wc may pad it with blanks,
	# and an unreadable file yields an empty string, which evaluates to 0.
	N=$(( $(wc -l < "page$p/par2.txt") ))

	tail -n "$N" "page$p/par1.txt" | tr ' ' '_' >> Table1/col1.txt

	# Page 9 is read from par2..par5; all other pages from par2, par4, par5
	# and par6 (presumably the extracted layout of the last page differs).
	if (( p == 9 )); then
		pars=(2 3 4 5)
	else
		pars=(2 4 5 6)
	fi

	# Array index 0..3 maps onto output columns 2..5. Using an array keeps
	# the mapping independent of the current IFS setting.
	for i in "${!pars[@]}"; do
		head -n "$N" "page$p/par${pars[i]}.txt" >> "Table1/col$((i + 2)).txt"
	done
done


# Col1 - Molecule_name
# Col2 - DeltaG_elec (Val+/-Error)
# Col3 - DeltaG_vdw (Val+/-Error)
# Col4 - DeltaG_hydr (Val+/-Error)
# Col5 - DeltaG_expt (Val)


# There is a problem:
# some molecules (e.g. m bis trifluoromethyl benzene, N methyl N 222 trifluoroethyl aniline)
# have two underscores (__) in their names.
# pdftotext turns each of those into just ONE space...
# After the SPACE -> underscore conversion the names therefore contain a single
# underscore and are inconsistent with the other files.
# So we need to do something about it...
# The simplest thing to do is to check the corresponding prmcrd file...
# Run the helper script from the current directory (presumably it produces
# the corrected molecule list, Table1/mollist.txt -- confirm in getMolList).
./getMolList

# col2..col4 hold the elec, vdw and hydr terms in that order. For each one,
# skip the first line and write field 1 (value) and field 3 (error) to
# separate files.
col_no=2
for term in elec vdw hydr; do
	tail -n +2 "Table1/col$col_no.txt" | gawk '{print $1}' > "Table1/DeltaG_$term.txt"
	tail -n +2 "Table1/col$col_no.txt" | gawk '{print $3}' > "Table1/Err_DeltaG_$term.txt"
	col_no=$((col_no + 1))
done

# The experimental column has no error field: copy everything after the
# first line unchanged.
tail -n +2 Table1/col5.txt > Table1/DeltaG_expt.txt

# Assemble the final table inside Table1/.
# Stop if the directory cannot be entered; otherwise the commands below
# would read and write files in whatever directory we happen to be in.
cd Table1 || exit 1

# Data columns of Table1.txt, in order, following the molecule-name column.
# The same list is recorded in Table1.columns, so it is defined only once.
columns=(
	DeltaG_elec.txt Err_DeltaG_elec.txt
	DeltaG_vdw.txt Err_DeltaG_vdw.txt
	DeltaG_hydr.txt Err_DeltaG_hydr.txt
	DeltaG_expt.txt
)

# multicol is an external tool; presumably it joins the files side by side.
multicol mollist.txt "${columns[@]}" > Table1.txt

# Publish the molecule list to the project tree. Skip the copy when
# MOLDB_PROJECTS is unset or empty, since the destination would otherwise
# resolve to /CanadaMol/... at the filesystem root.
if [[ -n ${MOLDB_PROJECTS:-} ]]; then
	cp mollist.txt "$MOLDB_PROJECTS/CanadaMol/Prototypes/molecules.lst"
else
	echo "MOLDB_PROJECTS is not set; skipping copy of mollist.txt to CanadaMol/Prototypes/molecules.lst" >&2
fi

# One line with the column file names, separated by single spaces.
echo "${columns[@]}" > Table1.columns

